{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from sqlalchemy import create_engine, MetaData\n",
    "from sqlalchemy.orm import sessionmaker\n",
    "from geoalchemy2 import Geometry\n",
    "from mapzen.api import MapzenAPI\n",
    "import glob\n",
    "import sys\n",
    "import os\n",
    "import time\n",
    "import pandas as pd\n",
    "sys.path.insert(0, os.path.realpath('..'))\n",
    "import django\n",
    "django.setup()\n",
    "from django.db import connections\n",
    "pd.set_option('display.max_rows', 3000)\n",
    "from IPython.display import display\n",
    "from firecares.firestation.models import FireDepartment\n",
    "import editdistance\n",
    "from IPython.lib.pretty import pprint\n",
    "\n",
    "engine = create_engine('postgresql://firecares:password@localhost:5432/geocoding_afg')\n",
    "conn = engine.connect()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### entering the danger zone!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "conn.execute('drop table if exists to_geocode;')\n",
    "tot = 0\n",
    "for f in glob.glob('*.xls'):\n",
    "    df = pd.read_excel(f)\n",
    "    n = len(df)\n",
    "    tot += n\n",
    "    print 'Loading: {} records'.format(n)\n",
    "    df.to_sql('to_geocode', con=engine, if_exists='append')\n",
    "\n",
    "assert conn.execute('select count(1) from to_geocode;').fetchone()[0] == tot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "conn.execute('alter table to_geocode add column geom geometry(MultiPolygon, 4326);')\n",
    "conn.execute('alter table to_geocode add column status integer;')\n",
    "conn.execute('alter table to_geocode add column coverage geometry(MultiPolygon, 4326);')\n",
    "conn.execute('ALTER TABLE to_geocode ADD COLUMN id SERIAL PRIMARY KEY;')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### out of the danger zone"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "meta = MetaData()\n",
    "from sqlalchemy.ext.automap import automap_base\n",
    "\n",
    "Base = automap_base()\n",
    "Base.prepare(engine, reflect=True)\n",
    "maker = sessionmaker(bind=engine)\n",
    "\n",
    "ToGeocode = Base.classes['to_geocode']\n",
    "\n",
    "from contextlib import contextmanager\n",
    "\n",
    "@contextmanager\n",
    "def session_scope():\n",
    "    \"\"\"Provide a transactional scope around a series of operations.\"\"\"\n",
    "    session = maker()\n",
    "    try:\n",
    "        yield session\n",
    "        session.commit()\n",
    "    except:\n",
    "        session.rollback()\n",
    "        raise\n",
    "    finally:\n",
    "        session.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "states = pd.read_csv('states.csv')\n",
    "\n",
    "def get_state_abbrev(name):\n",
    "    try:\n",
    "        return states[states['State'] == name]['Abbreviation'].values[0]\n",
    "    except:\n",
    "        return None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "fd_mappings = {}\n",
    "\n",
    "def massage_fdid(fdid, state):\n",
    "    import re\n",
    "    if fdid and state:\n",
    "        if state in fd_mappings and fdid in fd_mappings[state]:\n",
    "            return fd_mappings[state][fdid]\n",
    "        stripped = re.sub('-|\\s+', '', fdid)\n",
    "        if len(stripped) < 5 and stripped:\n",
    "            return fdid.zfill(5)\n",
    "        elif len(stripped) > 5 and stripped.lower().startswith(state.lower()):\n",
    "            return re.sub('^{}'.format(state), '', stripped)\n",
    "        else:\n",
    "            return stripped\n",
    "    else:\n",
    "        return ''\n",
    "\n",
    "def fuzzy_find_department(fdid, state):\n",
    "    massaged_fdid = massage_fdid(fdid, state)\n",
    "    if massaged_fdid:\n",
    "        fds = FireDepartment.objects.filter(fdid__iexact=massaged_fdid, state__iexact=state)\n",
    "        if fds.exists():\n",
    "            if len(fds) > 1:\n",
    "                print 'Multiple matches for {} - {}'.format(fdid, state)\n",
    "                return None, None\n",
    "            return fds.first(), None\n",
    "        else:\n",
    "            return None, None\n",
    "    else:\n",
    "        return None, None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "with session_scope() as session:\n",
    "    fds = session.query(ToGeocode).filter_by(status=None).all()\n",
    "    i = 0\n",
    "\n",
    "    for fd in fds:\n",
    "        i += 1\n",
    "        if i % 100 == 0:\n",
    "            print 'Only {} left'.format(float(session.query(ToGeocode).count()) - session.query(ToGeocode).filter(ToGeocode.status != None).count())\n",
    "        if fd.FDIN and fd.STATE:\n",
    "            hit, _ = fuzzy_find_department(fd.FDIN, get_state_abbrev(fd.STATE))\n",
    "            if hit:\n",
    "                fd.geom = hit.geom.ewkt if hit.geom else None\n",
    "                fd.coverage = hit.owned_tracts_geom.ewkt if hit.owned_tracts_geom else None\n",
    "                fd.status = 1\n",
    "                session.add(fd)\n",
    "                session.commit()\n",
    "            else:\n",
    "                print 'Miss on: {} {}'.format(fd.FDIN, get_state_abbrev(fd.STATE))\n",
    "                fd.status = 0\n",
    "                session.add(fd)\n",
    "                session.commit()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Double-check to make SURE that the grant IDs have the correct year since we're extracting the year into its own col\n",
    "\n",
    "import glob\n",
    "import re\n",
    "\n",
    "for fn in glob.glob('*.xls'):\n",
    "    year = re.match('(\\d+).*', fn).groups()[0]\n",
    "    df = pd.read_excel(fn)\n",
    "    records = map(lambda x: x[1], df[['GRANT_ID']].to_records())\n",
    "    for r in records:\n",
    "        assert re.match('\\w+-(\\d+)-.*', r).groups()[0] == year"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "conn.execute('ALTER TABLE to_geocode ADD COLUMN year int;')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "conn.execute('update to_geocode set year = cast(substring(\"GRANT_ID\" from 5 for 4) as int)')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "conn.execute(\"select count(1), year from to_geocode group by year\").fetchall()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "conn.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
